#ifdef __sw_slave__
#include <simd.h>
#include <slave.h>

#include "bitonic_blocks.h"
void my_print_int256(int256 v) {
  long *vp = &v;
  printf("[%ld %ld %ld %ld]", vp[0], vp[1], vp[2], vp[3]);
}
void swap(long *nums, int i, int j) {
  long tmp = nums[i];
  nums[i] = nums[j];
  nums[j] = tmp;
}
void reverse(long *nums, int begin, int end) {
  end--;
  while (begin < end) {
    swap(nums, begin++, end--);
  }
}
int next_permutation(long *arr, int n) {
  // From right to left, find the first digit(partitionNumber)
  // which violates the increase trend
  int p = n - 2;
  while (p > -1 && arr[p] >= arr[p + 1])
    --p;

  // If not found, which means current sequence is already the largest
  // permutation, then rearrange to the first permutation and return false
  if (p == -1) {
    reverse(arr, 0, n);
    return 0;
  }

  // From right to left, find the first digit which is greater
  // than the partition number, call it changeNumber
  int c = n - 1;
  while (c > 0 && arr[c] <= arr[p])
    --c;

  // Swap the partitionNumber and changeNumber
  swap(arr, p, c);
  // Reverse all the digits on the right of partitionNumber
  reverse(arr, p + 1, n);
  return 1;
}
void bitonic_sort_inc_16(long *arr) {
  AUX_FLIP_DEF;
  int256 v0, v1, v2, v3;
  simd_load(v0, arr + 0);
  simd_load(v1, arr + 4);
  simd_load(v2, arr + 8);
  simd_load(v3, arr + 12);
  BITONIC_VEC_INIT_8(v0, v1);
  BITONIC_INC_8(v0, v1);

  BITONIC_VEC_INIT_8(v2, v3);
  BITONIC_DEC_8(v2, v3);
  BITONIC_INC_4V(v0, v1, v2, v3);
  BITONIC_INC_8(v0, v1);
  BITONIC_INC_8(v2, v3);
  simd_store(v0, arr + 0);
  simd_store(v1, arr + 4);
  simd_store(v2, arr + 8);
  simd_store(v3, arr + 12);
}
void bitonic_sort_dec_16(long *arr) {
  AUX_FLIP_DEF;
  int256 v0, v1, v2, v3;
  simd_load(v0, arr + 0);
  simd_load(v1, arr + 4);
  simd_load(v2, arr + 8);
  simd_load(v3, arr + 12);
  BITONIC_VEC_INIT_8(v0, v1);
  BITONIC_INC_8(v0, v1);

  BITONIC_VEC_INIT_8(v2, v3);
  BITONIC_DEC_8(v2, v3);
  BITONIC_DEC_4V(v0, v1, v2, v3);
  BITONIC_DEC_8(v0, v1);
  BITONIC_DEC_8(v2, v3);
  simd_store(v0, arr + 0);
  simd_store(v1, arr + 4);
  simd_store(v2, arr + 8);
  simd_store(v3, arr + 12);
}
void bitonic_dec_16(long *arr) {
  AUX_FLIP_DEF;
  int256 v0, v1, v2, v3;
  simd_load(v0, arr + 0);
  simd_load(v1, arr + 4);
  simd_load(v2, arr + 8);
  simd_load(v3, arr + 12);
  BITONIC_DEC_4V(v0, v1, v2, v3);
  BITONIC_DEC_8(v0, v1);
  BITONIC_DEC_8(v2, v3);
  simd_store(v0, arr + 0);
  simd_store(v1, arr + 4);
  simd_store(v2, arr + 8);
  simd_store(v3, arr + 12);
}
void bitonic_inc_16(long *arr) {
  AUX_FLIP_DEF;
  int256 v0, v1, v2, v3;
  simd_load(v0, arr + 0);
  simd_load(v1, arr + 4);
  simd_load(v2, arr + 8);
  simd_load(v3, arr + 12);
  BITONIC_INC_4V(v0, v1, v2, v3);
  BITONIC_INC_8(v0, v1);
  BITONIC_INC_8(v2, v3);
  simd_store(v0, arr + 0);
  simd_store(v1, arr + 4);
  simd_store(v2, arr + 8);
  simd_store(v3, arr + 12);
}
void bitonic_sort_inc_32(long *arr, int n) {
  AUX_FLIP_DEF;
  bitonic_sort_inc_16(arr);
  bitonic_sort_dec_16(arr + 16);
  for (int i = 0; i < 16; i += 8) {
    int256 v0, v1, v2, v3;
    simd_load(v0, arr + i + 0);
    simd_load(v1, arr + i + 4);
    simd_load(v2, arr + i + 16);
    simd_load(v3, arr + i + 20);
    BITONIC_INC_4V(v0, v1, v2, v3);
    simd_store(v0, arr + i + 0);
    simd_store(v1, arr + i + 4);
    simd_store(v2, arr + i + 16);
    simd_store(v3, arr + i + 20);
  }
  bitonic_inc_16(arr);
  bitonic_inc_16(arr + 16);
}
void bitonic_sort_dec_32(long *arr, int n) {
  AUX_FLIP_DEF;
  bitonic_sort_inc_16(arr);
  bitonic_sort_dec_16(arr + 16);
  for (int i = 0; i < 16; i += 8) {
    int256 v0, v1, v2, v3;
    simd_load(v0, arr + i + 0);
    simd_load(v1, arr + i + 4);
    simd_load(v2, arr + i + 16);
    simd_load(v3, arr + i + 20);
    BITONIC_DEC_4V(v0, v1, v2, v3);
    simd_store(v0, arr + i + 0);
    simd_store(v1, arr + i + 4);
    simd_store(v2, arr + i + 16);
    simd_store(v3, arr + i + 20);
  }
  bitonic_inc_16(arr);
  bitonic_inc_16(arr + 16);
}
void bitonic_sort(long *arr, int n) {
  if (n < 32) {
    for (int i = n; i < 32; i ++) {
      arr[i] = 0x7FFFFFFFFFFFFFFFL;
    }
    bitonic_sort_32(arr);
    return;
  }
  if (n < 64) {
    for (int i = n; i < 64; i ++) {
      arr[i] = 0x7FFFFFFFFFFFFFFFL;
    }
    bitonic_sort_64(arr);
    return;
  }
  AUX_FLIP_DEF;
  // long ed, st;
  // asm volatile("rcsr %0, 4\n\t" : "=r"(st));
  
  int npad = 128;
  while (npad < n)
    npad <<= 1;
  
  for (int i = 0; i < npad; i += 128) {
    bitonic_sort_64_inc(arr + i + 0);
    bitonic_sort_64_dec(arr + i + 64);
  }
  for (int r0s = 7; (1 << r0s) <= npad; r0s++) {
    int r0 = 1 << r0s;

    for (int iout = 0; iout < npad; iout += r0) {
      // printf("iout: %d\n", iout);
      if (iout >> r0s & 1) {
        for (int r1 = r0 >> 1; r1 >= 64; r1 >>= 1) {
          for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
            // printf("iin, dec: %d\n", iin);
            for (int i = iin; i < iin + r1; i += 8) {
              int256 v0, v1, v2, v3;
              simd_load(v0, arr + i + 0);
              simd_load(v1, arr + i + 4);
              simd_load(v2, arr + i + r1 + 0);
              simd_load(v3, arr + i + r1 + 4);
              BITONIC_DEC_4V(v0, v1, v2, v3);
              simd_store(v0, arr + i + 0);
              simd_store(v1, arr + i + 4);
              simd_store(v2, arr + i + r1 + 0);
              simd_store(v3, arr + i + r1 + 4);
            }
          }
        }
        for (int i = iout; i < iout + r0; i += 64)
          bitonic_64_dec(arr + i);
      } else {
        for (int r1 = r0 >> 1; r1 >= 64; r1 >>= 1) {
          for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
            // printf("iin, inc: %d\n", iin);
            for (int i = iin; i < iin + r1; i += 8) {
              int256 v0, v1, v2, v3;
              simd_load(v0, arr + i + 0);
              simd_load(v1, arr + i + 4);
              simd_load(v2, arr + i + r1 + 0);
              simd_load(v3, arr + i + r1 + 4);
              BITONIC_INC_4V(v0, v1, v2, v3);
              simd_store(v0, arr + i + 0);
              simd_store(v1, arr + i + 4);
              simd_store(v2, arr + i + r1 + 0);
              simd_store(v3, arr + i + r1 + 4);
            }
          }
        }
        for (int i = iout; i < iout + r0; i += 64) {
          // printf("bitinc: %d\n", i);
          bitonic_64_inc(arr + i);
        }
      }
    }
  }
  // asm volatile("rcsr %0, 4\n\t" : "=r"(ed));
  // printf("%ld\n", ed - st);
}
void test() {
  if (!_MYID) {
    AUX_FLIP_DEF;
    volatile long *ticks = 2000;
    int256 f1v4 = simd_set_int256(0x3ff0000000000000L, 0x3ff0000000000000L, 0x3ff0000000000000L, 0x3ff0000000000000L);
    int256 flip0110 = simd_set_int256(0L << 63, 1L << 63, 1L << 63, 0L << 63);
    int256 flip1001 = simd_set_int256(1L << 63, 0L << 63, 0L << 63, 1L << 63);
    int256 flip1100 = simd_set_int256(1L << 63, 1L << 63, 0L << 63, 0L << 63);
    int256 flip0011 = simd_set_int256(0L << 63, 0L << 63, 1L << 63, 1L << 63);
    int256 flip0101 = simd_set_int256(0L << 63, 1L << 63, 0L << 63, 1L << 63);
    int256 flip1010 = simd_set_int256(1L << 63, 0L << 63, 1L << 63, 0L << 63);
    int256 f1_0110 = simd_set_int256(0x3ffL << 52, 0xbffL << 52, 0xbffL << 52, 0x3ffL << 52);
    int256 f1_1001 = simd_set_int256(0xbffL << 52, 0x3ffL << 52, 0x3ffL << 52, 0xbffL << 52);
    int256 f1_1100 = simd_set_int256(0xbffL << 52, 0xbffL << 52, 0x3ffL << 52, 0x3ffL << 52);
    int256 f1_0011 = simd_set_int256(0x3ffL << 52, 0x3ffL << 52, 0xbffL << 52, 0xbffL << 52);
    int256 f1_0101 = simd_set_int256(0x3ffL << 52, 0xbffL << 52, 0x3ffL << 52, 0xbffL << 52);
    int256 f1_1010 = simd_set_int256(0xbffL << 52, 0x3ffL << 52, 0xbffL << 52, 0x3ffL << 52);
    // int256 aux = simd_set_int256(1L << 63, 1L << 63, 1L << 63, 1L << 63);
    int256 flip[16];
    for (long i = 0; i < 16; i++) {
      flip[i] = simd_set_int256(i << 63, i >> 1 << 63, i >> 2 << 63, i >> 3 << 63);
    }
    int256 tests[24];
    int256 l4v4 = simd_set_int256(4, 4, 4, 4);

    tests[23] = simd_set_int256(3, 2, 1, 0);
    tests[22] = simd_set_int256(2, 3, 1, 0);
    tests[21] = simd_set_int256(3, 1, 2, 0);
    tests[20] = simd_set_int256(1, 3, 2, 0);
    tests[19] = simd_set_int256(2, 1, 3, 0);
    tests[18] = simd_set_int256(1, 2, 3, 0);
    tests[17] = simd_set_int256(3, 2, 0, 1);
    tests[16] = simd_set_int256(2, 3, 0, 1);
    tests[15] = simd_set_int256(3, 0, 2, 1);
    tests[14] = simd_set_int256(0, 3, 2, 1);
    tests[13] = simd_set_int256(2, 0, 3, 1);
    tests[12] = simd_set_int256(0, 2, 3, 1);
    tests[11] = simd_set_int256(3, 1, 0, 2);
    tests[10] = simd_set_int256(1, 3, 0, 2);
    tests[9] = simd_set_int256(3, 0, 1, 2);
    tests[8] = simd_set_int256(0, 3, 1, 2);
    tests[7] = simd_set_int256(1, 0, 3, 2);
    tests[6] = simd_set_int256(0, 1, 3, 2);
    tests[5] = simd_set_int256(2, 1, 0, 3);
    tests[4] = simd_set_int256(1, 2, 0, 3);
    tests[3] = simd_set_int256(2, 0, 1, 3);
    tests[2] = simd_set_int256(0, 2, 1, 3);
    tests[1] = simd_set_int256(1, 0, 2, 3);
    tests[0] = simd_set_int256(0, 1, 2, 3);
    asm("wcsr %0, 0x80" ::"r"(0x6c));
    // asm("wcsr %0, 0x81" ::"r"(0x9c));
    // long a[16];
    // for (int i = 0; i < 16; i ++) {
    //   a[i] = i;
    // }
    long p = 11;
    long cyc = 0;
    #define N 256
    for (int t = 0; t < 1000; t++) {
      long a[N];
      for (int i = 0; i < N; i++) {
        p = (p * 5 + 3) % 10000007;
        a[i] = p & 31;
        // printf("%ld ", a[i]);
      }
      // puts("");
      long st, ed;
      asm volatile("rcsr %0, 4\n\t" : "=r"(st));
      bitonic_sort(a, N);
      // bitonic_sort(a, 64);
      asm volatile("rcsr %0, 4\n\t" : "=r"(ed));
      cyc += ed - st;
      // printf("%ld\n", ed - st);
      // for (int i = 0; i < N; i++) {
      //   printf("%ld ", a[i]);
      // }
      // puts("");
      // printf("%ld %ld %ld %ld\n", a[0], a[1], a[2], a[3]);
      // int256 v0, v1;
      // simd_load(v0, a);
      // simd_load(v1, a + 4);
      // my_print_int256(v0);
      // my_print_int256(v1);

      // BITONIC_VEC_INIT_8(v0, v1);
      // printf("->");
      // my_print_int256(v0);
      // my_print_int256(v1);
      // BITONIC_DEC_8(v0, v1);
      // printf("->");
      // my_print_int256(v0);
      // my_print_int256(v1);
      // puts("");
    }
    printf("%ld\n", cyc);
    //   for (int i = 0; i < 1; i++) {
    //     for (int j = 0; j < 24; j++) {
    //       int256 v0 = tests[i] + l4v4;
    //       int256 v1 = tests[j];

    //       my_print_int256(v0);
    //       my_print_int256(v1);
    //       long st, ed;
    //       BITONIC_INIT_8_3r(v0, v1);
    //       puts("");
    //       // asm volatile("ldl %0, 2000($31)\n\t"
    //       //              : "=r"(st));
    //       // asm volatile("ldl %0, 2008($31)\n\t"
    //       //              : "=r"(ed));
    //       // printf("-%d->", ed - st);

    //       // my_print_int256(v0);
    //       // my_print_int256(v1);
    //       // printf("->");
    //       // BITONIC_INC_8_3r(v0, v1);
    //       // my_print_int256(v0);
    //       // my_print_int256(v1);
    //       // puts("");
    //     }
    //   }
  }
}
#endif
#ifdef __sw_host__
// #include <qthread.h>
extern void slave_test();
int main() {
  qthread_init();
  qthread_spawn(slave_test, 0);
  qthread_join();
  //athread_halt();
}
#endif